{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['predict the news category']\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "sns.set_style('darkgrid')\n",
    "import os\n",
    "print(os.listdir(\"../input\"))\n",
    "import pathlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_excel('../input/predict the news category/Data_Train.xlsx')\n",
    "test = pd.read_excel('../input/predict the news category/Data_Test.xlsx')\n",
    "sub = pd.read_excel('../input/predict the news category/Sample_submission.xlsx');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((7628, 2), (2748, 1))"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.shape, test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>STORY</th>\n",
       "      <th>SECTION</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>But the most painful was the huge reversal in ...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>How formidable is the opposition alliance amon...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               STORY  SECTION\n",
       "0  But the most painful was the huge reversal in ...        3\n",
       "1  How formidable is the opposition alliance amon...        0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>STORY</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2019 will see gadgets like gaming smartphones ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>It has also unleashed a wave of changes in the...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               STORY\n",
       "0  2019 will see gadgets like gaming smartphones ...\n",
       "1  It has also unleashed a wave of changes in the..."
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f138a8da080>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEKCAYAAAAFJbKyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAGEZJREFUeJzt3XtQVPfdx/HPskg0Ylk1LliHMKVip6Oo9RmTIkZaLKBBCiNYm7E+huhYL4EakrRepuig9dJk0tLYSxgzKenYSzQKrdSRkYkg6aRMjA6SaqvTodFUFiuCQY3gep4/nGdbqsL+dJfD5f36C36eZb/8hsw7e3b3rMOyLEsAAPgpxO4BAAD9C+EAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwEio3QMEw61bt+T18oZ4ADAxZIjTr+MGZDi8XkutrdfsHgMA+pUxY0b4dRynqgAARggHAMAI4QAAGCEcAAAjhAMAYIRwAACMEA4AgBHCAQAwQjgAAEYG5DvH0ftGRQyRM2yo3WPYztvxqVraOu0eAwgqwoGAcIYN1UdF8XaPYbtHC09KIhwY2DhVBQAwQjgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBgJGjhuHDhghYvXqy5c+cqPT1dpaWlkqRXX31VTzzxhDIzM5WZmanq6mrfbV577TWlpKQoLS1NR48e9a3X1NQoLS1NKSkpKikpCdbIAAA/BO3zOJxOp9auXauJEyeqvb1d2dnZSkxMlCQ9/fTTWrp0aZfjz549q4qKClVUVMjj8Sg3N1eHDh2SJBUVFemNN95QZGSkcnJylJycrPHjxwdrdABAN4IWDrfbLbfbLUkKDw9XbGysPB7PPY+vqqpSenq6wsLCFB0drZiYGNXX10uSYmJiFB0dLUlKT09XVVUV4QAAm/TKcxznz5/XqVOnNGXKFEnS7t27lZGRoXXr1qmtrU2S5PF4FBUV5btNZGSkPB7PPdcBAPYI+kfHXr16Vfn5+Vq/fr3Cw8P11FNPadWqVXI4HCouLtb27du1bds2WZZ1x20dDodu3bp11/XuOJ0OuVwPB+x3AEzwt4eBLqjh6OzsVH5+vjIyMpSamipJeuSRR3z/vmDBAq1YsUKSFBUVpaamJt+/eTwe36mue63fi9drqbX1WsB+D/RszJgRdo/QZ/C3h/7K3/+Og3aqyrIsbdiwQbGxscrNzfWtNzc3+74+fPiw4uLiJEnJycmqqKhQR0eHzp07p8bGRk2ePFnx8fFqbGzUuXPn1NHRoYqKCiUnJwdrbABAD4L2iOPYsWMqLy/XhAkTlJmZKUkqKCjQgQMHdPr0aUnSuHHjVFRUJEmKi4vT3Llz9eSTT8rpdKqwsFBOp1OSVFhYqGXLlsnr9So7O9sXGwBA73NYd3tyoZ/r7PRyuqCXjRkzQh8Vxds9hu0eLTypixc/sXsM4L7YfqoKADAwEQ4AgBHCAQAwQjgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBghHAAAIwQDgCAEcIBADBCOAAARggHAMAI4QAAGCEcAAAjhAMAYIRwAACMEA4AgBHCAQAwQjgAAEYIBwDASNDCceHCBS1evFhz585Venq6SktLJUmtra3Kzc1VamqqcnNz1dbWJkmyLEtbtmxRSkqKMjIy9OGHH/p+1v79+5WamqrU1FTt378/WCMDAPwQtHA4nU6tXbtWBw8e1O9+9zv9+te/1tmzZ1VSUqKEhARVVlYqISFBJSUlkqSamho1NjaqsrJSmzdv1qZNmyTdDs3OnTv11ltvac+ePdq5c6cvNgCA3he0cLjdbk2cOFGSFB4ertjYWHk8HlVVVSkrK0uSlJWVpcOHD0uSb93hcGjq1Km6cuWKmpubVVtbq8TERLlcLkVERCgxMVFHjx4N1tgAgB6E9sadnD9/XqdOndKUKVN06dIlud1uSbfj0tLSIknyeDyKiory3SYqKkoej+eO9cjISHk8nm7vz+l0yOV6OAi/CdAz/vYw0AU9HFevXlV+fr7Wr1+v8PDwex5nWdYdaw6H457r3fF6LbW2XjMfFvdtzJgRdo/QZ/C3h/7K3/+Og/qqqs7OTuXn5ysjI0OpqamSpNGjR6u5uVmS1NzcrFGjRkm6/QijqanJd9umpia53e471j0ej+8RCwCg9wUtHJZlacOGDYqNjVVubq5vPTk5WWVlZZKksrIyzZ49u8u6ZVk6ceKERowYIbfbrZkzZ6q2tlZtbW1qa2tTbW2tZs6cGayxAQA9CNqpqmPHjqm8vFwTJkxQZmamJKmgoEDLly/XmjVrtHfvXo0dO1bFxcWSpKSkJFVXVyslJUXDhg3T1q1bJUkul0urVq1STk6OJGn16tVyuVzBGhsA0AOHdbcnEfq5zk4v55l72ZgxI/RRUbzdY9ju0cKTunjxE7vHAO6Lv89x9MqrqgD4LzxiiIaFDbV7jD7hesenam/rtHsM/BfCAfQxw8KGKvHVRLvH6BPezXtX7SIcfQ3XqgIAGCEcAAAjhAMAYIRwAACMEA4AgBHCAQAwQjgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwMig/SCn8M8M1bCHhtg9Rp9w/Uan2q98avcYAPqJQRuOYQ8N0f+8+KbdY/QJx176X7WLcADwD6eqAABGCAcAwAjhAAAY8SscS5Ys8WsNADDwdfvk+I0bN3T9+nVdvnxZbW1tsixLktTe3q7m5uZeGRAA0Ld0G47f/va3Ki0tVXNzs+bPn+8LR3h4uBYtWtQrAwIA+pZuw7FkyRItWbJEv/rVr7R48eLemgkA0If59T6OxYsX64MPPtDHH38sr9frW8/KygraYACAvsmvJ8dffPFF/fCHP9SxY8d08uRJnTx5Ug0NDd3eZt26dUpISNC8efN8a6+++qqeeOIJZWZmKjMzU9XV1b5/e+2115SSkqK0tDQdPXrUt15TU6O0tDSlpKSopKTE9PcDAASYX484Ghoa9Mc//lEOh8PvHzx//nx961vf0ve+970u608//bSWLl3aZe3s2bOqqKhQRUWFPB6PcnNzdejQIUlSUVGR3njjDUVGRionJ0fJyckaP36833MAAALLr0cccXFxunjxotEPnj59uiIiIvw6tqqqSunp6QoLC1N0dLRiYmJUX1+v+vp6xcTEKDo6WmFhYUpPT1dVVZXRHACAwPLrEcfly5eVnp6uyZMna8iQf18Y8Be/+IXxHe7evVtlZWWaNGmS1q5dq4iICHk8Hk2ZMsV3TGRkpDwejyQpKiqqy3p9fb3xfQIAAsevcOTl5QXkzp566imtWrVKDodDxcXF2r59u7Zt2+Z7me9/cjgcunXr1l3Xe+J0OuRyPRyQmQcL9itw2MvAYj/7Hr/C8dhjjwXkzh555BHf1wsWLNCKFSsk3X5U0dTU5Ps3j8cjt9stSfdc747Xa6m19Vq3x4wZM8Jo9oGup/3qCfv5b+xlYD3ofsJ//v7t+fUcx5e+9CVNmzZN06ZNU3x8vL74xS9q2rRpxkP957vNDx8+rLi4OElScnKyKioq1NHRoXPnzqmxsVGTJ09WfHy8Ghsbde7cOXV0dKiiokLJycnG9wsACBy/HnEcP368y/eHDx/u8bmGgoIC1dXV6fLly5o1a5by8vJUV1en06dPS5LGjRunoqIiSbeffJ87d66efPJJOZ1OFRYWyul0SpIKCwu1bNkyeb1eZWdn+2IDALDHfX2Q09e+9rUe31Pxyiuv3LG2YMGCex6/cuVKrVy58o71pKQkJSUlmQ8JAAgKv8JRWVnp+/rWrVtqaGgwek8HAGDg8Csc77zzju9rp9OpcePG6Wc/+1nQhgIA9F1+hWPbtm3BngMA0E/49aqqpqYmrV69WgkJCZoxY4by8vK6vEwWADB4+BWOdevWKTk5WUePHlVNTY2++tWvat26dcGeDQDQB/kVjpaWFmVnZys0NFShoaGaP3++Wlpagj0bAKAP8iscI0eOVHl5ubxer7xer8rLy+VyuYI9GwCgD/IrHFu3btXBgweVmJiomTNn6tChQzxhDgCDlF+vqiouLtaOHTt8l0lvbW3Vjh07iAcADEJ+PeL461//2uWzNVwul06dOhW0oQAAfZdf4bh165ba2tp837e2tnb57HEAwODh16mqZ555Rt/85jeVlpYmh8OhgwcP+i6JDgAYXPwKR1ZWliZNmqT33ntPlmVp586dfO43AAxSfl8dd/z48cQCAHB/l1UHgP5iZPgQhQ4bavcYfcLN65/qcnvnA/8cwgFgQAsdNlTVs/hMH0lKqqmWAhAOv15VBQDA/yMcAAAjhAMAYIRwAACMEA4AgBHCAQAwQjgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjAQtHOvWrVNCQoLmzZvnW2ttbVVubq5SU1OVm5vr+1RBy7K0ZcsWpaSkKCMjQx9++KHvNvv371dqaqpSU1O1f//+YI0LAPBT0MIxf/587dq1q8taSUmJEhISVFlZqYSEBJWUlEiSampq1NjYqMrKSm3evFmbNm2SdDs0O3fu1FtvvaU9e/Zo586dXT7CFgDQ+4IWjunTpysiIqLLWlVVlbKysiTd/lTBw4cPd1l3OByaOnWqrly5oubmZtXW1ioxMVEul0sRERFKTEzU0aNHgzUyAMAPvfocx6VLl+R2uyVJbrdbLS0tkiSPx6OoqCjfcVFRUfJ4PHesR0ZGyuPx9ObIAID/0ic+yMmyrDvWHA7HPdd74nQ65HI9HJDZBgv2K3DYy8BiPwMrEPvZq+EYPXq0mpub5Xa71dzcrFGjRkm6/QijqanJd1xTU5PcbreioqJUV1fnW/d4PHrsscd6vB+v11Jr67VujxkzZsR9/hYDU0/71RP289/Yy8BiPwOru/30d6969VRVcnKyysrKJEllZWWaPXt2l3XLsnTixAmNGDFCbrdbM2fOVG1trdra2tTW1qba2lrNnDmzN0cGAPyXoD3iKCgoUF1dnS5fvqxZs2YpLy9Py5cv15o1a7R3716NHTtWxcXFkqSkpCRVV1crJSVFw4YN09atWyVJLpdLq1atUk5OjiRp9erVcrlcwRoZAOCHoIXjlVdeuet6aWnpHWsOh0MbN2686/E5OTm+cAAA7Mc7xwEARggHAMAI4QAAGCEcAAAjhAMAYIRwAACMEA4AgBHCAQAwQjgAAEYIBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBghHAAAIwQDgCAEcIBADBCOAAARggHAMAI4QAAGCEcAAAjhAMAYCTUjjtNTk7W8OHDFRISIqfTqX379qm1tVXPPfecPv74Y40bN04//vGPFRERIcuy9IMf/EDV1dUaOnSotm/frokTJ9oxNgBANj7iKC0tVXl5ufbt2ydJKikpUUJCgiorK5WQkKCSkhJJUk1NjRobG1VZWanNmzdr06ZNdo0MAFAfOlVVVVWlrKwsSVJWVpYOHz7cZd3hcGjq1Km6cuWKmpub7RwVAAY1W05VSdLSpUvlcDi0cOFCLVy4UJcuXZLb7ZYkud1utbS0SJI8Ho+ioqJ8t4uKipLH4/EdezdOp0Mu18PB/QUGGPYrcNjLwGI/AysQ+2lLOH7zm98oMjJSly5dUm5urmJjY+95rGVZd6w5HI5uf77Xa6m19Vq3x4wZM8K/YQeJnvarJ+znv7GXgcV+BlZ3++nvXtlyqioyMlKSNHr0aKWkpKi+vl6jR4/2nYJqbm7WqFGjJN1+hNHU1OS7bVNTU7ePNgAAwdXr4bh27Zra29t9X7/77ruKi4tTcnKyysrKJEllZWWaPXu2JPnWLcvSiRMnNGLECMIBADbq9VNVly5d0urVqyVJXq9X8+bN06xZsxQfH681a9Zo7969Gjt2rIqLiyVJSUlJqq6uVkpKioYNG6atW7f29sgAgP/Q6+GIjo7W73//+zvWR44cqdLS0jvWHQ6HNm7c2BujAQD80GdejgsA6B8IBwDACOEAABghHAAAI4QDAGCEcAAAjBAOAIARwgEAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBghHAAAIwQDgCAEcIBADBCOAAARggHAMAI4QAAGCEcAAAjhAMAYIRwAACMEA4AgBHCAQAwQjgAAEYIBwDACOEAABghHAAAI/0mHDU1NUpLS1NKSopKSkrsHgcABq1+EQ6v16uioiLt2rVLFRUVOnDggM6ePWv3WAAwKPWLcNTX1ysmJkbR0dEKCwtTenq6qqqq7B4LAAalfhEOj8ejqKgo3/eRkZHyeDw2TgQAg1eo3QP4w7KsO9YcDsc9jx8yxKkxY0b0+HOPvfS/DzTXQOLPfvXk0cKTAZik/wvEXr6b924AJhkYArGfSTXVAZhkYAjEfvaLRxxRUVFqamryfe/xeOR2u22cCAAGr34Rjvj4eDU2NurcuXPq6OhQRUWFkpOT7R4LAAalfnGqKjQ0VIWFhVq2bJm8Xq+ys7MVFxdn91gAMCg5rLs9gQAAwD30i1NVAIC+g3AAAIwQDptwCZXAWbdunRISEjRv3jy7R+n3Lly4oMWLF2vu3LlKT09XaWmp3SP1azdu3FBOTo6+/vWvKz09XT/5yU/sHikwLPS6mzdvWrNnz7Y++ugj68aNG1ZGRoZ15swZu8fqt+rq6qyGhgYrPT3d7lH6PY/HYzU0NFiWZVmffPKJlZqayt/mA7h165bV3t5uWZZldXR0WDk5Odbx48dtnurB8YjDBlxCJbCmT5+uiIgIu8cYENxutyZOnChJCg8PV2xsLFdpeAAOh0PDhw+XJN28eVM3b97s9s3L/QXhsAGXUEF/cP78eZ06dUpTpkyxe5R+zev1KjMzUzNmzNCMGTMGxH4SDhtYhpdQAXrb1atXlZ+fr/Xr1ys8PNzucfo1p9Op8vJyVVdXq76+Xn/729/sHumBEQ4bcAkV9GWdnZ3Kz89XRkaGUlNT7R5nwPjMZz6jxx9/XEePHrV7lAdGOGzAJVTQV1mWpQ0bNig2Nla5ubl2j9PvtbS06MqVK5KkTz/9VH/6058UGxtr81QPjneO26S6ulpbt271XUJl5cqVdo/UbxUUFKiurk6XL1/W6NGjlZeXpwULFtg9Vr/0/vvva9GiRZowYYJCQm7/f2VBQYGSkpJsnqx/On36tNauXSuv1yvLsjRnzhw9++yzdo/1wAgHAMAIp6oAAEYIBwDACOEAABghHAAAI4QDAGCkX3wCIGCnn//85zpw4IBCQkIUEhKioqIivfzyy2pubtbQoUMlSTExMb4rn5aVlWnXrl2yLEuWZSk7O1vnz5/XBx98oM7OTp0/f16f+9znJEkrV67UkSNH9JWvfEVz5sxRR0eHXnrpJb3zzjsKCQnR5z//eW3cuNF3iZovfOELys3N1dq1ayVJr7/+uq5du6a8vDwbdgaDFeEAunH8+HEdOXJE+/fvV1hYmFpaWtTZ2SlJevnllxUfH9/l+OrqapWWlur1119XZGSkbty4ofLycm3cuFHS7es/rVixQuXl5b7bHDlyxPf1j370I129elWHDh2S0+nU22+/rWeffVZ79uyRw+FQWFiYKisrtXz5co0aNSr4GwDcBaeqgG5cvHhRI0eOVFhYmCRp1KhRioyMvOfxJSUl+u53v+s75qGHHtI3vvENv+7r+vXr2rdvn9avXy+n0ylJys7OVlhYmN577z1JUmhoqBYuXMjnZMBWhAPoRmJioi5cuKC0tDRt2rRJdXV1vn974YUXlJmZqczMTO3YsUOSdObMGU2aNOm+7usf//iHxo4de8dFBSdNmqQzZ874vl+0aJH+8Ic/6JNPPrmv+wEeFKeqgG4MHz5c+/bt0/vvv68///nPeu655/T8889Luvupqgd1t6skW5bVZT08PFyZmZl68803fc+xAL2JRxxAD5xOpx5//HHl5+fr+9//viorK+957Pjx49XQ0HBf9/Poo4/qn//8p9rb27us/+Uvf9H48eO7rC1ZskRvv/22rl+/fl/3BTwIwgF04+9//7saGxt93586dUqf/exn73n8t7/9bb300ku6ePGiJKmjo0NvvvmmX/f18MMPKysrS9u3b5fX65V0+xVa169f15e//OUux7pcLs2ZM0d79+41/I2AB8epKqAb165d05YtW3TlyhU5nU7FxMSoqKhI3/nOd/TCCy/4ThWNHDlSv/zlL5WUlKR//etfys3N9Z1iys7O9vv+nn/+ee3YsUNpaWkKCQlRbGysfvrTn971FNYzzzyj3bt3B+x3BfzF1XEBAEY4VQUAMEI4AABGCAcAwAjhAAAYIRwAACOEAwBghHAAAIwQDgCAkf8DRfQC6mDeg0cAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.countplot(x='SECTION', data=train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2019-08-22 03:16:22--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip\n",
      "Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.69.128, 2607:f8b0:4001:c08::80\n",
      "Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.69.128|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 407727028 (389M) [application/zip]\n",
      "Saving to: ‘uncased_L-12_H-768_A-12.zip’\n",
      "\n",
      "uncased_L-12_H-768_ 100%[===================>] 388.84M   146MB/s    in 2.7s    \n",
      "\n",
      "2019-08-22 03:16:25 (146 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]\n",
      "\n",
      "--2019-08-22 03:16:26--  https://raw.githubusercontent.com/google-research/bert/master/modeling.py\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 37922 (37K) [text/plain]\n",
      "Saving to: ‘modeling.py’\n",
      "\n",
      "modeling.py         100%[===================>]  37.03K  --.-KB/s    in 0.01s   \n",
      "\n",
      "2019-08-22 03:16:26 (3.61 MB/s) - ‘modeling.py’ saved [37922/37922]\n",
      "\n",
      "--2019-08-22 03:16:26--  https://raw.githubusercontent.com/google-research/bert/master/optimization.py\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 6258 (6.1K) [text/plain]\n",
      "Saving to: ‘optimization.py’\n",
      "\n",
      "optimization.py     100%[===================>]   6.11K  --.-KB/s    in 0s      \n",
      "\n",
      "2019-08-22 03:16:26 (83.1 MB/s) - ‘optimization.py’ saved [6258/6258]\n",
      "\n",
      "--2019-08-22 03:16:27--  https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 34783 (34K) [text/plain]\n",
      "Saving to: ‘run_classifier.py’\n",
      "\n",
      "run_classifier.py   100%[===================>]  33.97K  --.-KB/s    in 0.01s   \n",
      "\n",
      "2019-08-22 03:16:27 (3.27 MB/s) - ‘run_classifier.py’ saved [34783/34783]\n",
      "\n",
      "--2019-08-22 03:16:28--  https://raw.githubusercontent.com/google-research/bert/master/tokenization.py\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 12257 (12K) [text/plain]\n",
      "Saving to: ‘tokenization.py’\n",
      "\n",
      "tokenization.py     100%[===================>]  11.97K  --.-KB/s    in 0s      \n",
      "\n",
      "2019-08-22 03:16:28 (94.7 MB/s) - ‘tokenization.py’ saved [12257/12257]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip\n",
    "!wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py \n",
    "!wget https://raw.githubusercontent.com/google-research/bert/master/optimization.py \n",
    "!wget https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py \n",
    "!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import modeling\n",
    "import optimization\n",
    "import run_classifier\n",
    "import tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "import zipfile\n",
    "folder = 'model_folder'\n",
    "with zipfile.ZipFile(\"uncased_L-12_H-768_A-12.zip\",\"r\") as zip_ref:\n",
    "    zip_ref.extractall(folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ">> Model output directory: model_folder/outputs\n",
      ">>  BERT pretrained directory: model_folder/uncased_L-12_H-768_A-12\n"
     ]
    }
   ],
   "source": [
    "BERT_MODEL = 'uncased_L-12_H-768_A-12'\n",
    "BERT_PRETRAINED_DIR = f'{folder}/uncased_L-12_H-768_A-12'\n",
    "OUTPUT_DIR = f'{folder}/outputs'\n",
    "print(f'>> Model output directory: {OUTPUT_DIR}')\n",
    "print(f'>>  BERT pretrained directory: {BERT_PRETRAINED_DIR}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 0, 3, ..., 1, 0, 2])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[\"SECTION\"].values.astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_examples(lines, set_type, labels=None):\n",
    "#Generate data for the BERT model\n",
    "    guid = f'{set_type}'\n",
    "    examples = []\n",
    "    if guid == 'train':\n",
    "        for line, label in zip(lines, labels):\n",
    "            text_a = line\n",
    "            label = str(label)\n",
    "            examples.append(\n",
    "              run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n",
    "    else:\n",
    "        for line in lines:\n",
    "            text_a = line\n",
    "            label = '0'\n",
    "            examples.append(\n",
    "              run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))\n",
    "    return examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import random\n",
    "random.seed(1)\n",
    "session_conf = tf.ConfigProto(intra_op_parallelism_threads=6, inter_op_parallelism_threads=5)\n",
    "from keras import backend"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model Hyper Parameters\n",
    "TRAIN_BATCH_SIZE = 32\n",
    "EVAL_BATCH_SIZE = 32\n",
    "LEARNING_RATE = 1e-5\n",
    "NUM_TRAIN_EPOCHS = 5\n",
    "WARMUP_PROPORTION = 0.1\n",
    "MAX_SEQ_LENGTH = 200\n",
    "\n",
    "# Model configs\n",
    "SAVE_CHECKPOINTS_STEPS = 100000  \n",
    "ITERATIONS_PER_LOOP = 100000\n",
    "NUM_TPU_CORES = 8\n",
    "VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')\n",
    "CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')\n",
    "INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')\n",
    "DO_LOWER_CASE = BERT_MODEL.startswith('uncased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_list = [0,1,2,3]\n",
    "tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)\n",
    "train_examples = create_examples(train[\"STORY\"].values, 'train', labels=train[\"SECTION\"].values.astype(int))\n",
    "\n",
    "tpu_cluster_resolver = None #Since training will happen on GPU, we won't need a cluster resolver\n",
    "#TPUEstimator also supports training on CPU and GPU. You don't need to define a separate tf.estimator.Estimator.\n",
    "run_config = tf.contrib.tpu.RunConfig(\n",
    "    cluster=tpu_cluster_resolver,\n",
    "    model_dir=OUTPUT_DIR,\n",
    "    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,\n",
    "    tpu_config=tf.contrib.tpu.TPUConfig(\n",
    "        iterations_per_loop=ITERATIONS_PER_LOOP,\n",
    "        num_shards=NUM_TPU_CORES,\n",
    "        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_train_steps = int(\n",
    "    len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)\n",
    "num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)\n",
    "\n",
    "model_fn = run_classifier.model_fn_builder(\n",
    "    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),\n",
    "    num_labels=len(label_list),\n",
    "    init_checkpoint=INIT_CHECKPOINT,\n",
    "    learning_rate=LEARNING_RATE,\n",
    "    num_train_steps=num_train_steps,\n",
    "    num_warmup_steps=num_warmup_steps,\n",
    "    use_tpu=False, #If False training will fall on CPU or GPU, depending on what is available  \n",
    "    use_one_hot_embeddings=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = tf.contrib.tpu.TPUEstimator(\n",
    "    use_tpu=False, #If False training will fall on CPU or GPU, depending on what is available \n",
    "    model_fn=model_fn,\n",
    "    config=run_config,\n",
    "    train_batch_size=TRAIN_BATCH_SIZE,\n",
    "    eval_batch_size=EVAL_BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Please wait...\n",
      ">> Started training at 2019-08-22 03:21:33.974862 \n",
      "  Num examples = 7628\n",
      "  Batch size = 32\n",
      ">> Finished training at 2019-08-22 03:38:46.914279\n"
     ]
    }
   ],
   "source": [
    "import datetime\n",
    "print('Please wait...')\n",
    "train_features = run_classifier.convert_examples_to_features(train_examples,{'0':0,'1':1,'2':2, '3':3}, MAX_SEQ_LENGTH, tokenizer)\n",
    "print('>> Started training at {} '.format(datetime.datetime.now()))\n",
    "print('  Num examples = {}'.format(len(train_examples)))\n",
    "print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))\n",
    "tf.logging.info(\"  Num steps = %d\", num_train_steps)\n",
    "train_input_fn = run_classifier.input_fn_builder(\n",
    "    features=train_features,\n",
    "    seq_length=MAX_SEQ_LENGTH,\n",
    "    is_training=True,\n",
    "    drop_remainder=True)\n",
    "estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n",
    "print('>> Finished training at {}'.format(datetime.datetime.now()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def input_fn_builder(features, seq_length, is_training, drop_remainder):\n",
    "    all_input_ids = []\n",
    "    all_input_mask = []\n",
    "    all_segment_ids = []\n",
    "    all_label_ids = []\n",
    "    \n",
    "    for feature in features:\n",
    "        all_input_ids.append(feature.input_ids)\n",
    "        all_input_mask.append(feature.input_mask)\n",
    "        all_segment_ids.append(feature.segment_ids)\n",
    "        all_label_ids.append(feature.label_id)\n",
    "    \n",
    "    def input_fn(params):\n",
    "        \"\"\"The actual input function.\"\"\"\n",
    "        print(params)\n",
    "        batch_size = 500\n",
    "\n",
    "        num_examples = len(features)\n",
    "\n",
    "        d = tf.data.Dataset.from_tensor_slices({\n",
    "            \"input_ids\":\n",
    "                tf.constant(\n",
    "                    all_input_ids, shape=[num_examples, seq_length],\n",
    "                    dtype=tf.int32),\n",
    "            \"input_mask\":\n",
    "                tf.constant(\n",
    "                    all_input_mask,\n",
    "                    shape=[num_examples, seq_length],\n",
    "                    dtype=tf.int32),\n",
    "            \"segment_ids\":\n",
    "                tf.constant(\n",
    "                    all_segment_ids,\n",
    "                    shape=[num_examples, seq_length],\n",
    "                    dtype=tf.int32),\n",
    "            \"label_ids\":\n",
    "                tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),\n",
    "    })\n",
    "\n",
    "        if is_training:\n",
    "            d = d.repeat()\n",
    "            d = d.shuffle(buffer_size=100)\n",
    "\n",
    "        d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)\n",
    "        return d\n",
    "\n",
    "    return input_fn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "predict_examples = create_examples(test['STORY'].values, 'test')\n",
    "\n",
    "predict_features = run_classifier.convert_examples_to_features(\n",
    "    predict_examples, {'0':0,'1':1,'2':2,'3':3}, MAX_SEQ_LENGTH, tokenizer)\n",
    "\n",
    "predict_input_fn = input_fn_builder(\n",
    "    features=predict_features,\n",
    "    seq_length=MAX_SEQ_LENGTH,\n",
    "    is_training=False,\n",
    "    drop_remainder=False)\n",
    "\n",
    "result = estimator.predict(input_fn=predict_input_fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{}\n"
     ]
    }
   ],
   "source": [
    "preds = []\n",
    "for prediction in result:\n",
    "      preds.append(np.argmax(prediction['probabilities']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SECTION</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   SECTION\n",
       "0        1\n",
       "1        2\n",
       "2        1\n",
       "3        0\n",
       "4        1"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sub['SECTION'] = preds\n",
    "sub.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    1182\n",
       "2     823\n",
       "0     424\n",
       "3     319\n",
       "Name: SECTION, dtype: int64"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sub['SECTION'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<a download=\"submission.csv\" href=\"data:text/csv;base64,U0VDVElPTgoxCjIKMQowCjEKMQoxCjIKMQoyCjAKMwoyCjEKMgoxCjMKMgozCjIKMgoyCjIKMAowCjIKMgozCjMKMAoxCjMKMgowCjIKMgoyCjIKMAoxCjAKMQozCjIKMgoyCjEKMQowCjEKMwoyCjEKMgoyCjAKMQoxCjAKMQoxCjIKMwoyCjEKMQoyCjAKMAoxCjEKMQoyCjAKMAoxCjEKMQoyCjIKMwowCjMKMAoyCjIKMgoyCjEKMQoxCjIKMQoxCjEKMQowCjIKMgoyCjEKMAoxCjMKMQoxCjIKMAoyCjMKMQoxCjIKMgoxCjIKMwozCjIKMQoxCjMKMAoyCjAKMwozCjMKMgoxCjEKMgoxCjAKMwoxCjEKMQoxCjIKMQowCjIKMQoyCjEKMgoyCjIKMQoyCjEKMQoxCjIKMQoxCjEKMQoyCjIKMQowCjAKMQoyCjAKMQoxCjIKMgoyCjIKMQoxCjEKMgoxCjIKMQoxCjMKMQoyCjIKMQozCjEKMQoyCjIKMQoyCjEKMQoxCjIKMQozCjEKMgoxCjEKMQoyCjEKMQoxCjMKMgowCjIKMgozCjEKMQoyCjEKMQoyCjAKMQowCjEKMQoxCjEKMQoxCjAKMAoxCjEKMwoxCjIKMgowCjIKMQoyCjIKMQoyCjEKMgoyCjEKMQozCjEKMQoyCjIKMQoxCjMKMAoxCjIKMgoyCjAKMgoyCjAKMQoxCjIKMQowCjIKMQoxCjIKMQozCjEKMQoxCjEKMQoyCjEKMQowCjMKMQoyCjIKMAoxCjIKMgoxCjEKMQoxCjIKMQowCjIKMQoyCjEKMgoxCjAKMgowCjEKMAoxCjEKMAoyCjMKMQozCjEKMQoyCjEKMwowCjIKMAoyCjEKMgoxCjIKMwoxCjAKMAoyCjAKMQoyCjEKMQowCjMKMQoyCjIKMwoyCjEKMgoxCjEKMQoyCjIKMQoxCjMKMgowCjMKMQoxCjIKMgozCjIKMQoxCjEKMQoyCjEKMAowCjIKMgoxCjEKMgoxCjEKMgoyCjIKMgowCjEKMQoxCjIKMAozCjMKMAoxCjEKMgozCjIKMQoxCjIKMQoyCjIKMgoyCjEKMQozCjIKMwowCjIKMQoyCjEKMQowCjEKMAozCjAKMgoxCjEKMQoxCjAKMQoxCjEKMAoxCjAKMQoyCjIKMgoyCjEKMQoxCjEKMQozCjAKMAoyCjIKMgoxCjEKMgoyCjEKMgowCjIKMQoxCjEKMgoyCjEKMQozCjMKMQozCjEKMgowCjEKMwowCjEKMQoyCjMKMAowCjMKMQoxCjEKMQoxCjEKMAoyCjIKMQowCjEKMQozCjMKMQoxCjMKMgoxCjEKMgozCjEKMAoyCjEKMAoyCjMKMQozCjAKMgoxCjEKMQoxCjIKMQoyCjEKMQoyCjEKMgoxCjIKMgowCjEKMQozCjEKMQoxCjIKMgozCjIKMgoxCjEKMgoyCjEKMQoxCjEKMgowCjMKMQoxCjEKMQoxCjMKMQoxCjMKMQoxCjIKMQoxCjMKMAoxCjEKMgoyCjAKMQozCjEKMgoxCjIKMgoxCjIKMAoxCjEKMgoxCjIKMQoxCjAKMQozCjEKMgoxCjIKMgoyCjIKMgoxCjEKMQoxCjIKMQoxCjIKMwozCjIKMQowCjEKMAoxCjIKMgoxCjEKMgoxCjEKMQoyCjMKMgoxCjEKMQoxCjIKMQowCjEKMwoxCjEKMQowCjEKMQoxCjAKMgoxCjAKMwoxCjEKMQozCjEKMgoyCjEKMgowCjEKMQoxCjMKMQoxCjMKMAoyCjAKMwowCjEKMQoxCjIKMgowCjEKMgoyCjEKMQoxCjEKMgozCjMKMgoyCjEKMgoyCjEKMAoxCjEKMQozCjEKMgoxCjEKMQoxCjEKMgoyCjIKMQoxCjIKMQozCjAKMgowCjAKMAoyCjIKMgoyCjAKMgoyCjMKMAoyCjEKMQozCjEKMQoyCjIKMwoxCjAKMQoxCjEKMQoyCjEKMgozCjMKMQowCjEKMAoxCjEKMgoyCjEKMgoxCjIKMQoxCjIKMgoyCjIKMAoyCjAKMQoyCjEKMQozCjIKMQoxCjIKMQoxCjIKMQoxCjEKMQoxCjIKMwoxCjAKMQoxCjEKMwoyCjMKMQozCjEKMAowCjEKMQoyCjEKMQoyCjAKMAozCjIKMQowCjEKMgoyCjIKMAoyCjEKMAozCjEKMgoyCjIKMAozCjAKMAowCjIKMgowCjEKMgozCjEKMgoyCjEKMQoyCjEKMwoxCjEKMQoyCjIKMQoxCjAKMgoyCjEKMAoyCjEKMQoyCjEKMQoxCjEKMQowCjAKMwowCjEKMAoxCjEKMQoxCjIKMgoxCjIKMgoyCjEKMAoyCjIKMgoyCjMKMgoxCjAKMQoxCjEKMgoxCjIKMQowCjMKMgozCjIKMQoyCjIKMQoxCjEKMQowCjEKMgozCjEKMAowCjEKMAoxCjEKMQozCjEKMQoxCjIKMQoxCjIKMQoxCjIKMQoxCjAKMAowCjEKMQoxCjIKMAoyCjMKMgowCjMKMQowCjMKMgoyCjAKMwoxCjAKMwozCjMKMAozCjAKMgoyCjAKMgozCjEKMQoxCjIKMQozCjIKMAozCjAKMQoyCjIKMQozCjIKMAoyCjAKMwoxCjMKMgoxCjEKMgoxCjAKMwoyCjEKMgowCjIKMQoyCjIKMQoxCjEKMwowCjEKMgoxCjIKMgoxCjIKMAoxCjIKMgozCjEKMQoxCjEKMAoxCjIKMAoyCjMKMgoxCjEKMQowCjMKMQoxCjIKMAoxCjMKMQoxCjEKMQowCjMKMQozCjEKMgoxCjAKMgozCjEKMgoyCjEKMAoxCjEKMAowCjEKMwoxCjEKMwoxCjIKMgoxCjIKMgoxCjAKMQoxCjMKMAoxCjIKMwoyCjAKMAoxCjAKMwoxCjIKMQozCjEKMgoyCjIKMQoxCjAKMwowCjEKMQoyCjEKMQoyCjIKMwowCjAKMQoxCjEKMQozCjEKMgoxCjEKMQoyCjAKMwoyCjEKMQoxCjIKMgoyCjEKMAowCjEKMgoyCjEKMQowCjIKMgowCjMKMwoyCjMKMQozCjIKMgoxCjEKMQoxCjIKMQoxCjEKMQoxCjIKMgoyCjIKMAowCjEKMgowCjIKMwowCjIKMQoxCjEKMQoxCjIKMwowCjMKMgoxCjIKMgozCjEKMgoyCjIKMgoyCjEKMgoyCjMKMQoxCjIKMAoxCjAKMQoyCjIKMgoyCjMKMwoxCjIKMQoxCjIKMQozCjIKMQoyCjAKMQoxCjEKMwoxCjEKMQowCjAKMgoxCjAKMgoxCjEKMQoxCjAKMgoxCjEKMwowCjEKMQoxCjMKMQoyCjAKMQoxCjIKMAoyCjIKMQoxCjIKMgoxCjEKMQoxCjMKMQoyCjEKMAoxCjIKMQowCjEKMAoxCjEKMgozCjEKMgoxCjEKMQoyCjEKMwoxCjIKMQoxCjIKMgoyCjEKMQoxCjEKMQoxCjIKMgoxCjEKMgoyCjAKMgoxCjEKMQowCjEKMgoxCjEKMQoyCjAKMQoyCjIKMwowCjIKMgoyCjAKMQoyCjIKMgoyCjEKMQowCjEKMgowCjEKMwoxCjAKMAowCjAKMwoxCjEKMQozCjEKMAoxCjEKMAoxCjIKMQoxCjAKMwoxCjIKMQowCjEKMgowCjEKMgoxCjMKMwoyCjEKMQoyCjEKMgozCjIKMQowCjMKMgoxCjAKMQoxCjIKMQoyCjEKMAoyCjIKMAoyCjMKMAoyCjIKMQozCjIKMAozCjMKMAoyCjMKMwoyCjAKMQoyCjAKMwoxCjAKMgoyCjAKMgoxCjIKMQoyCjIKMAoyCjEKMAowCjEKMQoxCjEKMQoxCjEKMQoyCjEKMQoxCjAKMQoxCjAKMQoyCjEKMgoxCjEKMgoyCjIKMwoyCjIKMQoxCjIKMQoxCjAKMQoyCjIKMAozCjIKMAoyCjIKMwoxCjAKMwowCjIKMQoxCjAKMAoxCjIKMQowCjIKMAoyCjEKMwozCjIKMgoyCjEKMQoxCjEKMQoxCjEKMQoxCjIKMQoxCjMKMQowCjEKMQoxCjIKMAoxCjEKMQoyCjIKMQoxCjEKMQoxCjIKMwoxCjIKMQozCjIKMgoxCjMKMQoxCjIKMgoyCjAKMAoyCjAKMgoxCjEKMgoxCjIKMQoxCjEKMgoxCjIKMwowCjMKMQozCjIKMQoyCjEKMgoxCjEKMQozCjEKMgoxCjEKMAowCjAKMQoxCjEKMgowCjEKMwoyCjIKMQowCjIKMgowCjAKMgoyCjIKMwoxCjEKMAoxCjMKMgoyCjIKMAowCjEKMgoyCjEKMgowCjEKMwoyCjEKMgoxCjEKMQoyCjEKMQoyCjIKMgowCjEKMQoyCjIKMQoxCjEKMQoxCjIKMgowCjEKMAowCjEKMgoyCjIKMAoyCjEKMQozCjAKMgoyCjMKMQowCjIKMQowCjIKMAoyCjEKMgozCjEKMQoxCjEKMQoxCjEKMAoxCjEKMwozCjEKMgowCjAKMgoyCjEKMQoyCjIKMQoxCjEKMgoxCjMKMQowCjIKMQoyCjEKMgoxCjIKMgowCjEKMgozCjIKMQoxCjMKMQowCjIKMwoyCjEKMAozCjEKMgoxCjEKMQoyCjMKMQozCjEKMQoxCjEKMAoxCjEKMgowCjAKMQoxCjIKMQoyCjEKMwoxCjEKMQoxCjIKMgoyCjEKMQoyCjEKMgowCjIKMQowCjEKMQoyCjEKMQoyCjMKMgoyCjIKMgoxCjAKMgoxCjEKMQowCjAKMgoxCjEKMAoyCjMKMQozCjIKMQowCjIKMQowCjEKMwoxCjAKMgoxCjIKMgoxCjEKMAoxCjEKMQoyCjIKMQoxCjIKMgozCjIKMQowCjIKMwoxCjEKMQowCjAKMQoxCjIKMAozCjEKMgowCjIKMAoxCjIKMQowCjEKMgoyCjIKMwoxCjEKMAoxCjMKMAoyCjIKMwoxCjIKMQoyCjIKMAozCjEKMQoyCjMKMwoxCjMKMgozCjIKMgoyCjEKMQoyCjEKMAoyCjIKMQoxCjMKMQozCjIKMAoyCjEKMwoxCjAKMQowCjEKMgoxCjIKMgoxCjAKMQozCjEKMQozCjEKMAoxCjEKMAowCjAKMAoyCjIKMQoyCjMKMAoxCjEKMgoyCjIKMQoyCjEKMQoxCjIKMwozCjEKMQoyCjIKMQowCjEKMgoxCjEKMQowCjEKMgowCjAKMwoyCjIKMQoxCjEKMAoxCjEKMAoxCjIKMgoxCjEKMQoxCjAKMgoyCjMKMQoyCjIKMwoyCjEKMwoyCjEKMQoxCjEKMQoyCjIKMQowCjAKMAoyCjAKMQoyCjAKMwoyCjMKMQoyCjMKMgoxCjIKMQoxCjMKMwoyCjEKMgoxCjAKMgoxCjIKMgozCjEKMQoxCjAKMgowCjIKMQoyCjEKMgoxCjEKMgoyCjEKMQoyCjIKMgozCjEKMQoyCjAKMgozCjEKMQoxCjMKMQoyCjEKMQoyCjEKMgowCjIKMgoxCjIKMgoyCjAKMAowCjAKMQozCjEKMgoxCjIKMgoyCjIKMgowCjIKMQoyCjMKMQoxCjMKMAowCjEKMQoyCjEKMgoxCjEKMQoyCjEKMgozCjIKMgozCjEKMgozCjEKMgoxCjIKMQoxCjEKMQoyCjIKMQoxCjAKMQozCjEKMgoyCjAKMgoyCjAKMgoxCjMKMQoxCjMKMQowCjEKMgoxCjAKMwoyCjEKMQozCjEKMQozCjAKMgoxCjMKMwowCjMKMQowCjEKMgowCjMKMAowCjMKMQoyCjIKMQoyCjAKMwowCjEKMQoxCjIKMgowCjEKMgoyCjAKMAoxCjEKMQowCjEKMAozCjMKMAoxCjIKMAowCjEKMAoyCjIKMAowCjIKMQowCjMKMQoxCjEKMwoxCjEKMQoyCjEKMQozCjEKMQozCjEKMAoxCjEKMAoxCjIKMQowCjIKMQoyCjIKMAoxCjEKMQowCjIKMQoxCjIKMgoyCjEKMAoyCjAKMQoyCjIKMAoxCjEKMgowCjAKMQoyCjEKMQowCjEKMgowCjEKMQowCjEKMQowCjAKMgoyCjIKMgoxCjIKMwoxCjEKMgoxCjMKMgoxCjEKMQowCjEKMQoxCjEKMgoxCjIKMQoxCjEKMQowCjEKMQowCjEKMwoyCjAKMAozCjEKMgowCjEKMwoyCjIKMgoxCjIKMwoxCjIKMgoxCjIKMgozCjIKMQoxCjAKMQozCjEKMgozCjEKMQoxCjAKMAoxCjIKMQowCjAKMgoyCjAKMQozCjEKMAoxCjEKMQoxCjMKMQoyCjMKMgoxCjEKMQoyCjAKMQoxCjEKMQoxCjEKMgozCjIKMQoyCjIKMgowCjIKMAoyCjIKMgoyCjMKMQoyCjEKMQowCjEKMgozCjIKMgowCjEKMgoxCjMKMAoxCjEKMQoyCjIKMgoxCjEKMgowCjEKMAoxCjIKMQoxCjAKMAowCjEKMgoyCjEKMQoxCjAKMQowCjEKMQoyCjEKMAoyCjEKMQozCjAKMwoyCjEKMgoxCjEKMQozCjEKMgozCjMKMQozCjIKMQoxCjEKMQoxCjEKMgowCjEKMAoxCjIKMgoxCjMKMgowCjEKMQoyCjMKMQoyCjEKMwoyCjIKMQoxCjEKMwozCjEKMgoxCjEKMgoxCjMKMgoyCjEKMwozCjIKMgoyCjEKMQoxCjAKMgowCjIKMQoxCjIKMwoxCjIKMQoxCjIKMgoyCjMKMgoyCjAKMAozCjEKMQowCjEKMgoxCjEKMQoyCjEKMgoxCjIKMQoxCjEKMwoyCjIKMAoxCjAKMQoxCjIKMwowCjEKMQoxCjEKMwoxCjEKMQowCjAKMgoxCjEKMQoxCjIKMQoxCjEKMQoyCjIKMAoyCjEKMwoyCjEKMQoxCjIKMQoyCjEKMwoxCjEKMQoxCjAKMgowCjIKMgoxCjIKMwoyCjIKMAoyCjEKMQoyCjAKMwoyCjMKMwowCjIKMgoyCjEKMwozCjEKMQozCjIKMAoxCjMKMgowCjAKMgozCjEKMgoxCjEKMQoxCjEKMgoyCjEKMgozCjEKMwoxCjEKMgoxCjEKMgoxCjEKMQoxCjEKMgoxCjEKMwoxCjAKMgoxCjEKMwowCjAKMQoxCjIKMQoyCjEKMgoxCjAKMgowCjEKMQozCjEKMgoyCjEKMQozCjEKMQoxCjEKMwoxCjEKMwoxCjEKMgowCjIKMwoxCjAKMQoyCjIKMQoyCjEKMQozCjEKMwoxCjAKMwowCjEKMQozCjMKMAowCjEKMQoxCjAKMQo=\" target=\"_blank\">Download CSV file</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import HTML\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import base64\n",
    "\n",
    "def create_download_link(df, title = \"Download CSV file\", filename = \"submission.csv\"):  \n",
    "    csv = df.to_csv(index=False)\n",
    "    b64 = base64.b64encode(csv.encode())\n",
    "    payload = b64.decode()\n",
    "    html = '<a download=\"{filename}\" href=\"data:text/csv;base64,{payload}\" target=\"_blank\">{title}</a>'\n",
    "    html = html.format(payload=payload,title=title,filename=filename)\n",
    "    return HTML(html)\n",
    "\n",
    "create_download_link(sub)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
